import os
from IPython.display import IFrame
import pandas as pd
import sys
sys.path.insert(0, '../..')
from JKBio.epigenetics import ChIP_helper as chiphelper
from JKBio import Helper as helper
import igv
import numpy as np
import pyBigWig
import itertools
import seaborn as sns
from scipy import stats
from matplotlib import cm
from scipy.cluster.hierarchy import linkage, leaves_list
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from bokeh.plotting import figure, show, output_notebook
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from pybedtools import BedTool
output_notebook()
%load_ext autoreload
%autoreload 2
! gcsfuse --only-dir Chip_AML jkobject data/seqs
singleend, pairedend = chiphelper.extractPairedSingleEndFrom('data/seqs')


! nextflow cloud create 'JKcluster' -c 4
! nextflow cloud create jkcluster -c "nextflow/nextflow.config" 40 && \
nextflow run nf-core/chipseq -c "nextflow/nextflow.config" \
--singleEnd \
--seq_center 'DFCI' \
--email 'jkobject@gmail.com' \
--bucket-dir 'gs://jkobject/Chip_AML/nextflow/CHIPprocess_2/' \
--keyfile '~/jkobject-b6f1adaffcb8.json' \
--projectname 'jkobject' \
--zone 'us-east1-b' \
--skipDiffAnalysis \
--narrowPeak \
--design "nextflow/design.csv" \
--genome 'GRCh38' \
--profile gcp \
--resume \
--skipPreseq \
--max_cpus 8 && \
nextflow cloud shutdown jkcluster
!gsutil -m cp -r gs://amlproject/Chip/results/bwa/mergedLibrary/macs/narrowPeak/ ../data
!gsutil -m cp -r gs://amlproject/Chip/results/bwa/mergedLibrary/bigwig/ ../../data
!cp ../../data/narrowPeak/*MV411*.narrowPeak ../../data/MV4narrow
! mkdir ../../data/BroadPeaks/MV411 && mv ../../data/BroadPeaks/MV411_* ../../data/BroadPeaks/MV411/
bindings = chiphelper.loadPeaks('../../data/MV4narrow/', isMacs=False,skiprows=0)
broadbindings = chiphelper.loadPeaks('../../data/BroadPeaks/MV411/', isMacs=False,skiprows=0)
SEgenes = pd.read_csv('../data/SEgenes.csv')
CTF = pd.read_csv('../data/CTF.csv', header=None)[0].tolist()
CTF.extend(['GATA2', 'IKZF1', 'LYL1', 'PU1', 'SMC1'])
CTF
CTF = list(set(CTF))
peaks = !ls ../../data/MV4narrow/*.narrowPeak
broadpeaks = ! ls ../../data/BroadPeaks/MV411/*.broadPeak
peaks = set([i.split('/')[-1].split('.')[0] for i in broadpeaks]) | set([i.split('/')[-1].split('.')[0] for i in peaks])
peaks
set(bindings['name'])
bindings
broadbindings
bindings = bindings[~bindings.name.isin(set(broadbindings.name))]
bindings = pd.concat([bindings, broadbindings])
len(bindings)
bindings['replicate']= [i.split('-')[-1][-1] for i in bindings['name']]
bindings['tf'] = [i.split('-')[2] for i in bindings['name']]
bindings['peak_number'] = ['_'.join([i.split('_')[2],i.split('_')[5]]) for i in bindings['peak_number']]
bindings
bindings.to_csv('../temp/bindings.bed', sep='\t', index=False)
# skiprows=1 skips the header row written by to_csv above
bindings = pd.read_csv('../temp/bindings.bed', sep='\t', header=None, index_col=None, skiprows=1,
                       names=["-log10pvalue", "-log10qvalue", "chrom", "end", "foldchange", "name",
                              "peak_number", "relative_summit_pos", "start", "replicate", "tf"])
bindings
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()
gsheet
bw = ! ls ../../data/bigwig
bw
len(set(bindings.name))
len(bw)
# one-off: rename bigwig files to the sample names from the sheet
for i in bw[2:]:
    a = gsheet[gsheet.id == 'mp' + i.split('_')[2]].name.values[0]
    i = '../../data/bigwig/' + i
    a = '../../data/bigwig/' + a + '.mLb.clN.bigWig'
    ! mv $i $a
    print(a)
set(bindings.name)
replicates = chiphelper.findReplicates(folder='data/seqs/results/bwa/', sep='_', namings='_R([0-9])',namepos=0)
Look at all TFs with a very low FRiP score, as flagged by ENCODE.
Look at all peak tracks together to find locations of intense co-binding.
Still validate those, but flag them as potentially bad.
Otherwise, remove them.
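As a cross-check, a FRiP-like score can be recomputed from the bigwig tracks with pyBigWig (a rough sketch; the peak columns come from `bindings` loaded above, and the 1% review threshold is an arbitrary pick):
def approx_frip(bigwig_path, peaks):
    # approximate FRiP: summed signal inside the sample's peaks / total signal
    bw = pyBigWig.open(bigwig_path)
    in_peaks = sum((bw.stats(c, int(s), int(e), type="sum")[0] or 0)
                   for c, s, e in peaks[['chrom', 'start', 'end']].values
                   if c in bw.chroms())
    total = sum((bw.stats(c, 0, size, type="sum")[0] or 0)
                for c, size in bw.chroms().items())
    bw.close()
    return in_peaks / total

# e.g. flag anything under ~1% signal-in-peaks for manual review:
# approx_frip('../../data/bigwig/' + bwfile, bindings[bindings.name == samplename]) < 0.01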
bad=[
"mp168",
"mp129",
"mp128",
"mp773",
"mp774",
"mp575",
"mp614",
"mp714",
"mp433",
"mp156",
"mp650",
"mp604",
"mp27",
"mp627",
"mp117",
"mp771",
"mp118",
"mp431",
"mp430",
"mp324",
"mp565",
"mp569",
"mp125",
"mp627",
"mp568",
"mp427",
"mp124",
"mp716",
"mp581",
"mp589",
"mp321",
"mp601",
"mp745",
"mp772",
"mp770",
"mp590",
"mp623",
"mp718"]
%matplotlib inline
mergedpeak, tomergebam, remove, ratiosofunique = chiphelper.mergeReplicatePeaks(
    bindings, '../../data/bigwig/', markedasbad=bad, window=150, mincov=4, doPlot=True,
    minKL=10, cov={}, use='poisson', MINOVERLAP=0.25, lookeverywhere=True, only='', saveloc='')
tomergebam
mergedpeak = mergedpeak[mergedpeak.columns[[2,9,3,5,6,4,0,1,7,10]]]
mergedpeak.to_csv('../temp/merged_replicates.csv')
mergedpeak = pd.read_csv('../temp/merged_replicates.csv', index_col=0)
bigwigs = os.listdir('../../data/bigwig/')
# build a new list instead of calling list.remove() while iterating (which skips items)
exclude = remove + toremove + ['scale', 'POLII', 'IGG', 'CTCF', 'INPUT']
bigwigs = [val for val in bigwigs if not any(v in val for v in exclude)]
bigwigs = ['data/bigwig/' + i for i in bigwigs]
set(mergedpeak.tf)
mergedpeak.foldchange.min()
mergedpeak['name']=mergedpeak.tf
## Removing bad ChIP proteins
mergedpeak = mergedpeak[~mergedpeak['name'].isin(['CDK13','GSE1'])]
merged = chiphelper.simpleMergePeaks(mergedpeak[~mergedpeak.tf.isin(['MED1','SMC1','CTCF','POLII','IRF2BP2_FLAG','IRF2BP2', 'H3K27ac', 'H3K27me3', 'H3K4me3', 'H3K79me2',])], window=150)
len(merged)
len(mergedpeak)
merged
merged.to_csv('../temp/merged.bed', sep='\t',index=None)
merged = pd.read_csv('../temp/merged.bed', sep='\t')
fig = sns.pairplot(merged[merged.columns[8:14]], corner=True, diag_kind="kde", kind="reg",
                   plot_kws={"scatter_kws": {"alpha": .05}})

def col_nan_scatter(x, y, **kwargs):
    # scatter that drops positions where both signals are zero
    df = pd.DataFrame({'x': x[:], 'y': y[:]})
    df = df[df.sum(1) != 0]
    plt.scatter(df['x'], df['y'])

def col_nan_kde_histo(x, **kwargs):
    # kde that drops zero-signal positions
    df = pd.DataFrame({'x': x[:]})
    df = df[df['x'] != 0]
    sns.kdeplot(df['x'])

# with corner=True there are no upper panels, so map the lower triangle and the diagonal
fig = fig.map_lower(col_nan_scatter)
fig = fig.map_diag(col_nan_kde_histo)
fig.savefig('../results/cobinding/pairplot_experiments.pdf')
plt.show()
# np.unique returns (values, counts): `counts` holds the number of co-bound TFs,
# `val` holds how many consensus peaks show that number
counts, val = np.unique(merged[merged.columns[8:]].astype(bool).sum(1).values, return_counts=True)
# set_yscale returns None, so keep the Axes and save through it instead of chaining
ax = sns.barplot(data=pd.DataFrame(val, index=counts, columns=['counts']).T)
ax.set_yscale("log")
ax.figure.savefig('../results/cobinding/pairplot_experiments.pdf')
plt.show()
i = merged[merged.columns[8:]].astype(bool).sum(1)
print(i.max(),i.mean(),i.min())
len(merged)
We evaluate the probability of each event (1 TF binding, 2 TFs binding, ..., k TFs binding) as a binomial whose number of trials is the size of the consensus peak set. Its success probability comes from $S_k$: the sum, over all combinations of $k$ TFs among $n$ (here $n = 33$, $k$ going from 1 to 29), of the probability that those $k$ TFs bind together. Assuming independence,
$p(a \cap b) = p(a)\,p(b) = p(ab)$
and, for unions,
$p(ab \cup ac) = p(ab) + p(ac) - p(abc)$
Since a site bound by more than $k$ TFs is counted several times in $S_k$, we correct for overcounting:
$p(\text{exactly } k) = S_k - \sum_{j>k} {j \choose k}\, p(\text{exactly } j)$
For $a, b, c, d$ and $k = 2$ this expands to
$p(ab)+p(ac)+p(ad)+p(bc)+p(bd)+p(cd) - {3 \choose 2}\big(p(abc)+p(abd)+p(acd)+p(bcd)\big) + {4 \choose 2}\,p(abcd)$
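As a quick sanity check of the correction, here is a toy example with three independent "TFs" where the corrected $S_k$ match exact enumeration (the probabilities are made up):
from math import comb

p_toy = [0.3, 0.2, 0.1]  # made-up binding probabilities for 3 TFs
# raw sums S_k over all k-combinations
S = {k: sum(np.prod(c) for c in itertools.combinations(p_toy, k)) for k in (1, 2, 3)}
# p(exactly k) = S_k - sum_{j>k} C(j,k) * p(exactly j)
for k in (2, 1):
    for j in range(k + 1, 4):
        S[k] -= comb(j, k) * S[j]
# exact enumeration over all 2^3 binding configurations
brute = {k: 0. for k in (1, 2, 3)}
for bits in itertools.product([0, 1], repeat=3):
    pr = np.prod([q if b else 1 - q for q, b in zip(p_toy, bits)])
    if sum(bits):
        brute[sum(bits)] += pr
print(S, brute)  # the two dictionaries should agree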
proba = merged[merged.columns[8:]].astype(bool).sum(0) / len(merged)
sums = {i: 0 for i in range(1, 30)}
# S_k: sum of products of binding probabilities over all k-combinations (slow for mid-range k)
for i in range(29, 0, -1):
    print(i)
    if sums[i] > 0:
        continue
    print(helper.combin(33, i))
    for j in itertools.combinations(proba, i):
        sums[i] += np.prod(j)
sums = helper.fileToDict('sums.json')
sums
# overcount correction: p(exactly k) = S_k - sum_{j>k} C(j,k) * p(exactly j)
for i in range(29, 0, -1):
    for j in range(i + 1, 30):
        icomb = helper.combin(j, i)
        sums[str(i)] -= icomb * sums[str(j)]
sums
from scipy.stats import binom
# expected count (mean) and variance of consensus peaks with exactly i co-bound TFs
for i in range(29, 0, -1):
    print(i, binom.mean(len(merged), sums[str(i)]), binom.var(len(merged), sums[str(i)]))
    sums[str(i)] = [binom.mean(len(merged), sums[str(i)]), binom.var(len(merged), sums[str(i)])]
data = pd.DataFrame(sums).T.rename(columns={0:'mean',1:'var'})
merged[merged.columns[8:]].astype(bool).sum().sum()
(val*counts).sum()
int((data['mean'] * data.index.astype(int)).sum())
data.sum()
ax = sns.barplot(data=data.T)
ax.set_yscale("log")
val
res = pd.DataFrame()
res['change'] = val / data['mean']  # observed peak counts over the binomial expectation
res['count']=list(res.index)
res
ax = sns.barplot(data=res.T)
ax.set_yscale("log")
res.T
fig = plt.bar(res['count'],res['change'],log=True)
plt.show()
sns.clustermap(np.corrcoef(stats.zscore(merged[merged.columns[8:]].values.T, axis=1)), figsize=(20, 20), xticklabels=merged.columns[8:], yticklabels=merged.columns[8:]).ax_col_dendrogram.set_visible(False)
merged[merged.columns[:8]].to_csv('../temp/conscensus.bed',sep='\t',index=None, columns=None)
# `cor` is assumed to be the (samples x consensus peaks) signal matrix computed upstream
TOTVAR = 2000  # keep the TOTVAR most-variable peaks (assumed value, tune as needed)
PCS = 30       # number of principal components to keep (assumed value)
cor = np.log2(1 + cor)
a = cor.var(0).argsort()
subcor = cor[:, a[-TOTVAR:]]
subcor.shape
subcor = PCA(PCS).fit_transform(subcor)
%matplotlib inline
cmaps = ['Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn']
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(ax=ax, data=subcor, xticklabels=False, cmap=cmaps[-4],
            yticklabels=[i.split('.')[-4].split('/')[-1] for i in bigwigs],
            cbar_kws={"orientation": "horizontal"},
            vmax=0.7).set_title('PCs of each binding profile over the consensus peak set')
#fig.savefig("temp/co_occupancy_matrix.png")
fig.savefig('PCs_binding_peakset.png')
I have tried Gaussian mixtures and agglomerative clustering; only the latter yields a hierarchical clustering.
Gaussian mixtures seem to make more sense here, since the data we have, for now, is fairly homogeneous.
I am still not happy with the clustering. Likely culprits are the weight given to outlier values and the large number of noisy values from locations with no peaks.
We could borrow RNA-seq-style preprocessing to improve this (clamping values, log transform, a first round of PCA, ...), as sketched below.
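A sketch of that RNA-seq-style cleanup (clamp outliers to a high percentile, log-transform, standardize, then PCA; the percentile and component count are arbitrary picks):
def preprocess_signal(mat, clamp_pct=99.5, n_pcs=30):
    # clamp extreme values column-wise, compress dynamic range, then reduce dimensionality
    mat = np.minimum(mat, np.percentile(mat, clamp_pct, axis=0))
    mat = np.log2(1 + mat)
    return PCA(n_components=n_pcs).fit_transform(scale(mat))

# e.g. cluster on preprocess_signal(cor) instead of raw subcor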
labels = GaussianMixture(n_components=2, covariance_type='diag').fit_predict(subcor)
names = np.array([i.split('.')[-4].split('/')[-1] for i in bigwigs])
sort = labels.argsort()
p = helper.plotCorrelationMatrix(data=cor[sort],
names=names[sort],
colors=labels[sort],
title="correlation between TF occupancy",
interactive=True)
show(p)
p = helper.scatter(TSNE(n_components=2, perplexity=5).fit_transform(subcor), labels=names, colors=labels)
show(p)
sns.clustermap(subcor)
additional = {}
additional['activation'] = chiphelper.simpleMergePeaks(mergedpeak[mergedpeak.tf.isin(["H3K27ac",'H3K79me2','H3K36me3','H3K4me3'])], window=10, mergedFold="max")
additional['repression'] = mergedpeak[mergedpeak.tf=='H3K27me3']
additional['IRF2BP2'] = mergedpeak[mergedpeak.tf=='IRF2BP2_FLAG']
additional['MED1'] = mergedpeak[mergedpeak.tf=='MED1']
additional['SMC1'] = mergedpeak[mergedpeak.tf=='SMC1']
additional['CTCF'] = mergedpeak[mergedpeak.tf=='CTCF']
additional['POLII'] = mergedpeak[mergedpeak.tf=='POLII']
for key, val in additional.items():
    merged[key] = chiphelper.putInConscensus(merged[merged.columns[:8]], val)
merged = merged.replace(np.nan, 0)
merged[key].astype(bool).sum()
# adding ATACseq
ATAC= chiphelper.loadPeaks(peakFile='../../data/ATACseq/ATAC_MV411.mRp.clN_peaks.broadPeak')
len(ATAC)
merged['ATAC'] = chiphelper.putInConscensus(merged[merged.columns[:8]],ATAC)
merged = merged.replace(np.nan,0)
merged['ATAC'].astype(bool).sum()
#compute enhancers at TSS in the matrix (promoters)
promoters = pd.read_csv('../data/human_epdnew_TeLy2.bed', sep='\t', header=None)\
    .rename(columns={0: 'chrom', 1: 'start', 2: 'end', 3: 'name', 5: 'strand'}).drop(columns=4)
promoters['foldchange']=1
promoters['name']=[i[:-2] for i in promoters['name']]
merged['promoters'] = chiphelper.putInConscensus(merged[merged.columns[:8]],promoters)
merged = merged.replace(np.nan,0)
merged['promoters'].astype(bool).sum()
set(bindings[bindings.tf=="H3K27ac"].name)
! mkdir ../../data/MV411_H3K27ac
! cp ../../data/MV4narrow/mp70-MV411-H3K27ac-r2.narrowPeak ../../data/MV411_H3K27ac
! cp ../../data/MV4narrow/mp734-MV411_DMSO-H3K27ac-r1.narrowPeak ../../data/MV411_H3K27ac
! cp ../../data/BroadPeaks/MV411/mp88-MV411-H3K27ac-r3.broadPeak ../../data/MV411_H3K27ac
! cp ../../data/BroadPeaks/MV411/mp702-MV411_DMSO-H3K27ac-r1.broadPeak ../../data/MV411_H3K27ac
! cp ../../data/BroadPeaks/MV411/mp183-MV411_DMSO-H3K27ac-r1.broadPeak ../../data/MV411_H3K27ac
! cp ../../data/BroadPeaks/MV411/mp136-MV411-H3K27ac-r1.broadPeak ../../data/MV411_H3K27ac
! gsutil cp gs://amlproject/Chip/results/bwa/mergedLibrary/*MV411*H3K27* MV411_H3K27ac/
peaks = [
"../../data/MV411_H3K27ac/mp70-MV411-H3K27ac-r2.narrowPeak",
"../../data/MV411_H3K27ac/mp734-MV411_DMSO-H3K27ac-r1.narrowPeak",
"../../data/MV411_H3K27ac/mp88-MV411-H3K27ac-r3.broadPeak",
"../../data/MV411_H3K27ac/mp702-MV411_DMSO-H3K27ac-r1.broadPeak",
"../../data/MV411_H3K27ac/mp183-MV411_DMSO-H3K27ac-r1.broadPeak",
"../../data/MV411_H3K27ac/mp136-MV411-H3K27ac-r1.broadPeak"
]
for val in peaks:
    valbed = val + ".bed"
    ! mv $val $valbed
peaks[1:]
for peak in peaks[1:]:
    chiphelper.MakeSuperEnhancers(peak + '.bed',
                                  bamFile='.'.join(peak.split('.')[:-1]) + '.mLb.clN.sorted.bam',
                                  baiFile='.'.join(peak.split('.')[:-1]) + '.mLb.clN.sorted.bam.bai',
                                  controlBam='../../data/diffBinding_hist/INPUT_R1.mLb.clN.sorted.bam',
                                  controlBai='../../data/diffBinding_hist/INPUT_R1.mLb.clN.sorted.bam.bai',
                                  outdir='~/data/MV411_H3K27ac',
                                  rosePath="../src/ROSE/")
! rm ~/data/MV411_H3K27ac/*.bam*
## loading and merging
! mkdir ../results/ROSE/MV411/
! cp ~/data/MV411_H3K27ac/*.txt ../results/ROSE/MV411/
! cp ~/data/MV411_H3K27ac/*.bed ../results/ROSE/MV411/
! cp ~/data/MV411_H3K27ac/*.png ../results/ROSE/MV411/
rose = chiphelper.ReadRoseSuperEnhancers("../results/ROSE/MV411/")
rose = chiphelper.simpleMergePeaks(rose,window=1000).drop(columns=["relative_summit_pos","-log10pvalue","-log10qvalue"])
rose = rose[rose[rose.columns[5:]].astype(bool).sum(1)>1]
rose = rose.sort_values(by=['chrom','start','end']).reset_index(drop=True)
merged['super_enhancer'] = chiphelper.putInConscensus(merged[merged.columns[:8]],rose)
merged = merged.replace(np.nan,0)
merged['super_enhancer'].astype(bool).sum()
## loading super enhancers from Max's files
## comparing
## making regular enhancers: merged["regular_enhancers"]
merged['regular_enhancer'] = (merged['activation'].astype(bool) & ~merged[['super_enhancer','promoters']].astype(bool).sum(1).astype(bool)).astype(float)
merged.to_csv('../temp/merged.bed', sep='\t',index=None)
merged = pd.read_csv('../temp/merged.bed', sep='\t')
Look at the AUC for all ChIPs over all peaks of all ChIPs (one possible implementation is sketched below).
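An assumption of what "AUC" means here: treat each ChIP's binary peak calls as labels and every other ChIP's signal over the consensus set as a score, then compute the ROC AUC pairwise:
from sklearn.metrics import roc_auc_score

sig = merged[merged.columns[8:]]
auc = pd.DataFrame(np.nan, index=sig.columns, columns=sig.columns)
for a in sig.columns:          # labels: does ChIP a have a peak here?
    y = sig[a].astype(bool)
    if 0 < y.sum() < len(y):   # AUC is undefined for all-0 or all-1 labels
        for b in sig.columns:  # scores: ChIP b's signal over the same peaks
            if a != b:
                auc.loc[a, b] = roc_auc_score(y, sig[b])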
rand = np.random.choice(merged.index,5000)
viridis = cm.get_cmap('viridis', 256)
data = merged[merged.columns[-11:]]
for val in data.columns:
    data[val] = stats.zscore(np.log2(1 + data[val]))
# min-max scale to [0, 255] so values can index into the 256-color map
data = (((data - data.min(0)) / (data.max(0) - data.min(0))) * 255).astype(int)
data = data.loc[rand]
for val in data.columns:
    data[val] = [viridis(v) for v in data[val]]
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[8:-7]].loc[rand].T),col_cluster=False, z_score=0, vmin=0,vmax=3, col_colors = data, figsize=(30,20),xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.savefig('../temp/clustermap_cobinding_scaled_marks.pdf')
plt.show()
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[8:-11]].loc[rand].T), vmin=0,vmax=3,figsize=(20,15),z_score=0,col_colors=data, xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.savefig('../temp/clustermap_cobinding_scaled_marks.pdf')
plt.show()
How many peaks in A (columns) overlap a peak in B (rows)?
In other words: what percentage of B's peaks are overlapped by A's peaks? (See the sketch below.)
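For reference, this asymmetric overlap matrix can be computed directly from the boolean consensus matrix (a sketch of what `chiphelper.computeOverlapCorrelation` is assumed to return, not necessarily its exact implementation):
def overlap_matrix(df):
    # entry [B, A] = fraction of B's peaks that are also bound by A
    b = df.astype(bool).values.astype(int)
    inter = b.T @ b  # pairwise co-bound peak counts
    return pd.DataFrame(inter / b.sum(0)[:, None], index=df.columns, columns=df.columns)

# overlap_matrix(merged[merged.columns[8:]])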
merged[merged.columns[8:]].sum()
data = pd.DataFrame(stats.zscore(0.01+merged[merged.columns[8:]]).T, columns=merged.index, index=merged.columns[8:])
link = linkage(data[:-11])
# mirror the pattern used below: correlate all rows so the annotation columns exist,
# then keep annotation rows as color bars (np.corrcoef(data.iloc[:-11]) is only 26x26
# and cannot be indexed by the annotation positions 26..36)
corr = pd.DataFrame(np.corrcoef(data), index=data.index, columns=data.index)
col = corr.iloc[-11:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
for val in col.columns:
    col[val] = [viridis(v) for v in col[val]]
fig = sns.clustermap(corr.iloc[:-11][corr.columns[np.concatenate((leaves_list(link), [26,27,28,29,30,31,32,33,34,35,36]))]], row_linkage=link, col_colors=col.T, col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
fig.savefig('../results/cobinding/correlation_with_annotation.pdf')
plt.show()
merged['regular_enhancer'] = merged['regular_enhancer'].astype(float)
correlation, overlap = chiphelper.computeOverlapCorrelation(merged, norm=True)
data = pd.DataFrame(data=overlap,index=merged.columns[8:], columns=merged.columns[8:])
link = linkage(data.iloc[:-11]) # D being the measurement
col = data[-11:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
for val in col.columns:
    col[val] = [viridis(v) for v in col[val]]
fig = sns.clustermap(data.iloc[:-11][data.columns[np.concatenate((leaves_list(link),[26,27,28,29,30,31,32,33,34,35,36]))]], row_linkage=link, col_colors=col.T, col_cluster=False,figsize=(12,12))
fig.ax_col_dendrogram.set_visible(False)
fig.savefig('../results/cobinding/overlap.pdf')
plt.show()
pd.concat([merged[merged.columns[8:]].astype(bool).sum(0),
           merged[merged.columns[8:]].max(),
           merged[merged.columns[8:]].replace(0, np.NaN).mean(),
           merged[merged.columns[8:]].replace(0, np.NaN).var()],
          axis=1).rename(columns={0: 'sum', 1: 'max', 2: 'mean', 3: 'var'})
Correlation computed on the overlaps shown above.
data = pd.DataFrame(data=correlation,index=merged.columns[8:], columns=merged.columns[8:])
link = linkage(data.iloc[:-11]) # D being the measurement
col = data.iloc[-11:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
for val in col.columns:
    col[val] = [viridis(v) for v in col[val]]
fig = sns.clustermap(data.iloc[:-11][data.columns[np.concatenate((leaves_list(link),[26,27,28,29,30,31,32,33,34,35,36]))]], row_linkage=link, col_colors=col.T, col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
fig.savefig("../results/cobinding/correlation_onoverlap.pdf")
plt.show()
data = pd.DataFrame(data=np.corrcoef(stats.zscore(merged[merged.columns[8:]]).T), index=merged.columns[8:], columns=merged.columns[8:])
link = linkage(data.iloc[:-11]) # D being the measurement
col = data.iloc[-11:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
for val in col.columns:
    col[val] = [viridis(v) for v in col[val]]
fig = sns.clustermap(data.iloc[:-11][data.columns[np.concatenate((leaves_list(link),[26,27,28,29,30,31,32,33,34,35,36]))]], row_linkage=link, col_colors=col.T, col_cluster=False,figsize=(12,12))
fig.ax_col_dendrogram.set_visible(False)
fig.savefig("../results/cobinding/correlation_onoverlap.pdf")
plt.show()
# TODO (stubs, first argument still to be filled in):
# chiphelper.filterOnEnrichment(..., toremove=[""])
# chiphelper.AssignToClosestExpressed()

They tested a new model, validated with CRISPRi-FlowFISH, that can map enhancers to their target genes. Using it as a benchmark, they found ~70% accuracy for their model, versus less than 50% for the closest-expressed-gene baseline.
This could be a way to integrate our Hi-C data (ATAC-seq-like data is needed as well, but it is openly available).
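For comparison, the closest-expressed-gene baseline is easy to sketch with pybedtools (assumptions: the consensus BED has no header row, and `expressed_genes.bed` is a hypothetical BED of expressed gene TSSs):
# hypothetical closest-expressed-gene baseline
peaks_bt = BedTool('../temp/conscensus.bed').sort()       # consensus peaks written above
genes_bt = BedTool('../data/expressed_genes.bed').sort()  # hypothetical expressed-gene TSSs
closest = peaks_bt.closest(genes_bt, d=True)              # nearest expressed gene + distance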

helper.scatter(TSNE(n_components=2, perplexity=5).fit_transform(data.T), labels=zones.columns[11:], colors=labels)
~1,500 lines of code. It seems to be a slightly updated version of the code I used the first time, which came from their original CRC paper.
There is not much documentation, but it was updated just last week and might continue to be maintained.

We need to redo everything for a similar normal cell type, picking TFs based on its CRC (found with CRCmapper or from the literature).